library(knitr)
## Warning: package 'knitr' was built under R version 4.0.5
## Anomaly Detection (Fraud Detection)
Check whether there are any anomalies in the given sales dataset. The objective of this task is fraud detection.
You are a Data analyst at Carrefour Kenya and are currently undertaking a project that will inform the marketing department on the most relevant marketing strategies that will result in the highest no. of sales (total price including tax). Your project has been divided into four parts where you’ll explore a recent marketing dataset by performing various unsupervised learning techniques and later providing recommendations based on your insights.
Define the question, the metric for success, the context, experimental design taken. Read and explore the given dataset. Identify anomalies in the dataset = fraud detection
The data used for this project will inform the marketing department on the most relevant marketing strategies that will result in the highest no. of sales (total price including tax)
#install.packages("anomalize") # Anormally detection
library(anomalize)
## Warning: package 'anomalize' was built under R version 4.0.5
## == Use anomalize to improve your Forecasts by 50%! =============================
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.0.5
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(tibbletime)
## Warning: package 'tibbletime' was built under R version 4.0.5
##
## Attaching package: 'tibbletime'
## The following object is masked from 'package:stats':
##
## filter
# Load the supermarket sales data (two columns: Date, Sales).
# NOTE(review): absolute, user-specific Windows path — consider a relative
# path (e.g. with here::here()) so the script is portable across machines.
anom<-read.csv("C:/Users/Silvia/Downloads/Supermarket_Sales_Forecasting - Sales.csv")
# Previewing the first 6 rows
head(anom)
## Date Sales
## 1 1/5/2019 548.9715
## 2 3/8/2019 80.2200
## 3 3/3/2019 340.5255
## 4 1/27/2019 489.0480
## 5 2/8/2019 634.3785
## 6 3/25/2019 627.6165
# Previewing the datatypes of our data: Date is read in as character and
# will need conversion to the Date class before time-series analysis.
str(anom)
## 'data.frame': 1000 obs. of 2 variables:
## $ Date : chr "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
## $ Sales: num 549 80.2 340.5 489 634.4 ...
# Total the sales that occurred on the same date, giving one row per day.
# aggregate() returns the grouping column as "Date" and the summed sales
# under the default name "x".
anom_aggregate<-aggregate(anom$Sales,by=list(Date=anom$Date),FUN=sum)
head(anom_aggregate)
## Date x
## 1 1/1/2019 4745.181
## 2 1/10/2019 3560.949
## 3 1/11/2019 2114.963
## 4 1/12/2019 5184.764
## 5 1/13/2019 2451.204
## 6 1/14/2019 3966.617
# Frequency table of Date: the number of transactions recorded on each day.
# table() names the grouping column "Var1" by default, hence the by.y below.
date_table<-data.frame(table(anom$Date))
head(date_table)
## Var1 Freq
## 1 1/1/2019 12
## 2 1/10/2019 9
## 3 1/11/2019 8
## 4 1/12/2019 11
## 5 1/13/2019 10
## 6 1/14/2019 13
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date() masks base::date()
## x dplyr::filter() masks tibbletime::filter(), stats::filter()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x lubridate::setdiff() masks base::setdiff()
## x lubridate::union() masks base::union()
# Combine daily sales totals with daily transaction counts, joining on the
# date columns ("Date" in anom_aggregate, "Var1" in date_table).
final_df<-merge(anom_aggregate,date_table,by.x= "Date", by.y="Var1")
final_df
## Date x Freq
## 1 1/1/2019 4745.1810 12
## 2 1/10/2019 3560.9490 9
## 3 1/11/2019 2114.9625 8
## 4 1/12/2019 5184.7635 11
## 5 1/13/2019 2451.2040 10
## 6 1/14/2019 3966.6165 13
## 7 1/15/2019 5944.2600 13
## 8 1/16/2019 4289.0820 10
## 9 1/17/2019 3142.7550 11
## 10 1/18/2019 2780.4735 9
## 11 1/19/2019 4914.7245 16
## 12 1/2/2019 1945.5030 8
## 13 1/20/2019 3655.4490 10
## 14 1/21/2019 2392.0995 8
## 15 1/22/2019 1704.7695 7
## 16 1/23/2019 5994.1875 17
## 17 1/24/2019 5402.0505 13
## 18 1/25/2019 4700.3670 17
## 19 1/26/2019 4457.5125 17
## 20 1/27/2019 4635.8970 14
## 21 1/28/2019 4999.7115 14
## 22 1/29/2019 3516.5655 12
## 23 1/3/2019 2078.1285 8
## 24 1/30/2019 2558.2620 9
## 25 1/31/2019 5232.4965 14
## 26 1/4/2019 1623.6885 6
## 27 1/5/2019 3536.6835 12
## 28 1/6/2019 3614.2050 9
## 29 1/7/2019 2834.2440 9
## 30 1/8/2019 5293.7325 18
## 31 1/9/2019 3021.3435 8
## 32 2/1/2019 2444.5365 6
## 33 2/10/2019 3141.0225 11
## 34 2/11/2019 4542.1530 8
## 35 2/12/2019 2998.9890 8
## 36 2/13/2019 934.2375 8
## 37 2/14/2019 2454.0915 8
## 38 2/15/2019 6830.7855 19
## 39 2/16/2019 2503.7670 8
## 40 2/17/2019 5299.5705 13
## 41 2/18/2019 1496.0295 7
## 42 2/19/2019 4228.1190 9
## 43 2/2/2019 4140.9480 14
## 44 2/20/2019 2706.4170 10
## 45 2/21/2019 1393.7385 6
## 46 2/22/2019 2442.3105 11
## 47 2/23/2019 2339.5890 8
## 48 2/24/2019 2722.4610 9
## 49 2/25/2019 4807.2360 16
## 50 2/26/2019 2408.1645 9
## 51 2/27/2019 5859.4515 14
## 52 2/28/2019 2097.0180 6
## 53 2/3/2019 5467.9275 14
## 54 2/4/2019 2439.4965 11
## 55 2/5/2019 3031.1295 12
## 56 2/6/2019 2905.4235 13
## 57 2/7/2019 7228.2105 20
## 58 2/8/2019 5084.6565 12
## 59 2/9/2019 3271.8945 13
## 60 3/1/2019 2634.3660 10
## 61 3/10/2019 3163.2300 12
## 62 3/11/2019 2961.2520 11
## 63 3/12/2019 3677.5515 12
## 64 3/13/2019 2063.6070 10
## 65 3/14/2019 7214.6340 18
## 66 3/15/2019 2942.4150 12
## 67 3/16/2019 3154.4730 9
## 68 3/17/2019 1976.2890 6
## 69 3/18/2019 1292.8335 7
## 70 3/19/2019 5740.3920 16
## 71 3/2/2019 6560.3055 18
## 72 3/20/2019 5458.2045 15
## 73 3/21/2019 1877.5155 6
## 74 3/22/2019 3179.1480 10
## 75 3/23/2019 4095.0420 11
## 76 3/24/2019 3477.4635 11
## 77 3/25/2019 2272.9665 9
## 78 3/26/2019 1962.5130 13
## 79 3/27/2019 2902.8195 10
## 80 3/28/2019 2229.4020 10
## 81 3/29/2019 4023.2430 8
## 82 3/3/2019 4853.1735 14
## 83 3/30/2019 4487.0595 11
## 84 3/4/2019 3894.4395 12
## 85 3/5/2019 6230.8785 17
## 86 3/6/2019 3092.5965 11
## 87 3/7/2019 1438.2585 9
## 88 3/8/2019 3125.3880 11
## 89 3/9/2019 7474.0470 16
# Rename the merged columns to descriptive names
# (x -> Total.Sales, Freq -> count).
names(final_df)<-c("Date","Total.Sales","count")
head(final_df)
## Date Total.Sales count
## 1 1/1/2019 4745.181 12
## 2 1/10/2019 3560.949 9
## 3 1/11/2019 2114.963 8
## 4 1/12/2019 5184.764 11
## 5 1/13/2019 2451.204 10
## 6 1/14/2019 3966.617 13
# Parse the character dates (month/day/year format) into Date objects with
# lubridate::mdy() so they can serve as the index of a tbl_time.
final_df$Date<-mdy(final_df$Date)
str(final_df)
## 'data.frame': 89 obs. of 3 variables:
## $ Date : Date, format: "2019-01-01" "2019-01-10" ...
## $ Total.Sales: num 4745 3561 2115 5185 2451 ...
## $ count : int 12 9 8 11 10 13 13 10 11 9 ...
# Keep just the daily transaction count keyed by date, and sort by date:
# time_decompose() assumes an ascending index, but the merge above left the
# rows in lexicographic date order (1/1, 1/10, 1/11, ...), which triggers
# tibbletime's repeated "Index not ordered" warnings and can distort the
# seasonal/trend decomposition.
final_df1 <- final_df %>%
  select(Date, count) %>%
  arrange(Date)
final_df1
## Date count
## 1 2019-01-01 12
## 2 2019-01-10 9
## 3 2019-01-11 8
## 4 2019-01-12 11
## 5 2019-01-13 10
## 6 2019-01-14 13
## 7 2019-01-15 13
## 8 2019-01-16 10
## 9 2019-01-17 11
## 10 2019-01-18 9
## 11 2019-01-19 16
## 12 2019-01-02 8
## 13 2019-01-20 10
## 14 2019-01-21 8
## 15 2019-01-22 7
## 16 2019-01-23 17
## 17 2019-01-24 13
## 18 2019-01-25 17
## 19 2019-01-26 17
## 20 2019-01-27 14
## 21 2019-01-28 14
## 22 2019-01-29 12
## 23 2019-01-03 8
## 24 2019-01-30 9
## 25 2019-01-31 14
## 26 2019-01-04 6
## 27 2019-01-05 12
## 28 2019-01-06 9
## 29 2019-01-07 9
## 30 2019-01-08 18
## 31 2019-01-09 8
## 32 2019-02-01 6
## 33 2019-02-10 11
## 34 2019-02-11 8
## 35 2019-02-12 8
## 36 2019-02-13 8
## 37 2019-02-14 8
## 38 2019-02-15 19
## 39 2019-02-16 8
## 40 2019-02-17 13
## 41 2019-02-18 7
## 42 2019-02-19 9
## 43 2019-02-02 14
## 44 2019-02-20 10
## 45 2019-02-21 6
## 46 2019-02-22 11
## 47 2019-02-23 8
## 48 2019-02-24 9
## 49 2019-02-25 16
## 50 2019-02-26 9
## 51 2019-02-27 14
## 52 2019-02-28 6
## 53 2019-02-03 14
## 54 2019-02-04 11
## 55 2019-02-05 12
## 56 2019-02-06 13
## 57 2019-02-07 20
## 58 2019-02-08 12
## 59 2019-02-09 13
## 60 2019-03-01 10
## 61 2019-03-10 12
## 62 2019-03-11 11
## 63 2019-03-12 12
## 64 2019-03-13 10
## 65 2019-03-14 18
## 66 2019-03-15 12
## 67 2019-03-16 9
## 68 2019-03-17 6
## 69 2019-03-18 7
## 70 2019-03-19 16
## 71 2019-03-02 18
## 72 2019-03-20 15
## 73 2019-03-21 6
## 74 2019-03-22 10
## 75 2019-03-23 11
## 76 2019-03-24 11
## 77 2019-03-25 9
## 78 2019-03-26 13
## 79 2019-03-27 10
## 80 2019-03-28 10
## 81 2019-03-29 8
## 82 2019-03-03 14
## 83 2019-03-30 11
## 84 2019-03-04 12
## 85 2019-03-05 17
## 86 2019-03-06 11
## 87 2019-03-07 9
## 88 2019-03-08 11
## 89 2019-03-09 16
# Convert the data frame to a tibble so anomalize can promote it to a
# tbl_time (it auto-detects the Date column as the index).
final_df1 <- as_tibble(final_df1)
class(final_df1)
## [1] "tbl_df" "tbl" "data.frame"
# Decompose the daily count into season + trend + remainder, flag remainder
# outliers (anomalize's default method), then recompose the normal-range
# bands back onto the observed scale.
df_anomalized <- final_df1 %>%
time_decompose(count, merge = TRUE) %>%
anomalize(remainder) %>%
time_recompose()
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 7 days
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 30 days
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
df_anomalized %>% glimpse()
## Rows: 89
## Columns: 11
## $ Date <date> 2019-01-01, 2019-01-02, 2019-01-03, 2019-01-04, 2019-01~
## $ count <int> 12, 8, 8, 6, 12, 9, 9, 18, 8, 9, 8, 11, 10, 13, 13, 10, ~
## $ observed <dbl> 12, 9, 8, 11, 10, 13, 13, 10, 11, 9, 16, 8, 10, 8, 7, 17~
## $ season <dbl> 0.883193879, 1.000751930, -2.026508631, 0.003578774, 0.0~
## $ trend <dbl> 10.03092, 10.16344, 10.29595, 10.42847, 10.56007, 10.691~
## $ remainder <dbl> 1.0858871, -2.1641886, -0.2694456, 0.5679495, -0.6329071~
## $ remainder_l1 <dbl> -14.79896, -14.79896, -14.79896, -14.79896, -14.79896, -~
## $ remainder_l2 <dbl> 15.41235, 15.41235, 15.41235, 15.41235, 15.41235, 15.412~
## $ anomaly <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "N~
## $ recomposed_l1 <dbl> -3.884847, -3.634772, -6.529515, -4.366910, -4.166053, -~
## $ recomposed_l2 <dbl> 26.32647, 26.57654, 23.68180, 25.84440, 26.04526, 25.615~
# Plot flagged anomalies on the recomposed series
df_anomalized %>% plot_anomalies(ncol = 3, alpha_dots = 0.75)
#### Adjusting Trend and Seasonality
# Visualise the full decomposition with the automatically chosen
# frequency and trend spans.
p1 <- df_anomalized %>%
plot_anomaly_decomposition() +
ggtitle("Freq/Trend = 'auto'")
p1
# When "auto" is used, get_time_scale_template() determines the logical
# frequency and trend spans based on the scale of the data. You can
# uncover the logic:
get_time_scale_template()
## # A tibble: 8 x 3
## time_scale frequency trend
## <chr> <chr> <chr>
## 1 second 1 hour 12 hours
## 2 minute 1 day 14 days
## 3 hour 1 day 1 month
## 4 day 1 week 3 months
## 5 week 1 quarter 1 year
## 6 month 1 year 5 years
## 7 quarter 1 year 10 years
## 8 year 5 years 30 years
# Re-decompose with a shorter, locally specified trend window (2 weeks
# instead of the default 3 months for daily data) so the trend line tracks
# local movements more closely.
p2 <- final_df1 %>%
time_decompose(count,
frequency = "auto",
trend = "2 weeks") %>%
anomalize(remainder) %>%
plot_anomaly_decomposition() +
ggtitle("Trend = 2 Weeks (Local)")
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 7 days
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## Warning in lubridate::floor_date(x, unit): Multi-unit not supported for weeks.
## Ignoring.
## Warning in lubridate::ceiling_date(x, unit): Multi-unit not supported for weeks.
## Ignoring.
## trend = 14 days
# Show plots: auto trend (p1) vs. the local 2-week trend (p2)
p1
p2
* After adjusting the trend using local parameters we can see some anomalies being detected.
# Adjust globally using set_time_scale_template() to update the default
# template: change the "3 month" trend to "2 weeks" for time_scale = "day".
# Retrieve the starting template with time_scale_template(), mutate() the
# trend field in the desired location, and push the result back with
# set_time_scale_template(). get_time_scale_template() verifies the change.
time_scale_template() %>%
mutate(trend = ifelse(time_scale == "day", "2 weeks", trend)) %>%
set_time_scale_template()
get_time_scale_template()
## # A tibble: 8 x 3
## time_scale frequency trend
## <chr> <chr> <chr>
## 1 second 1 hour 12 hours
## 2 minute 1 day 14 days
## 3 hour 1 day 1 month
## 4 day 1 week 2 weeks
## 5 week 1 quarter 1 year
## 6 month 1 year 5 years
## 7 quarter 1 year 10 years
## 8 year 5 years 30 years
# Plot again: with the global template updated, the default trend span
# for daily data is now 2 weeks (no trend argument needed).
p3 <- final_df1 %>%
time_decompose(count) %>%
anomalize(remainder) %>%
plot_anomaly_decomposition() +
ggtitle("Trend = 2 Weeks (Global)")
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 7 days
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## Warning in lubridate::floor_date(x, unit): Multi-unit not supported for weeks.
## Ignoring.
## Warning in lubridate::ceiling_date(x, unit): Multi-unit not supported for weeks.
## Ignoring.
## trend = 14 days
p3
# Reset the time scale template back to the package's original defaults
# so later chunks use the standard 3-month trend again.
time_scale_template() %>%
set_time_scale_template()
# Verify the change
get_time_scale_template()
## # A tibble: 8 x 3
## time_scale frequency trend
## <chr> <chr> <chr>
## 1 second 1 hour 12 hours
## 2 minute 1 day 14 days
## 3 hour 1 day 1 month
## 4 day 1 week 3 months
## 5 week 1 quarter 1 year
## 6 month 1 year 5 years
## 7 quarter 1 year 10 years
## 8 year 5 years 30 years
# Extract the actual data points flagged as anomalies. (At the default
# alpha = 0.05 with the default 30-day trend, the result below is empty —
# no anomalies are flagged.)
final_df1 %>%
time_decompose(count) %>%
anomalize(remainder) %>%
time_recompose() %>%
filter(anomaly == 'Yes')
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 7 days
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 30 days
## # A time tibble: 0 x 10
## # Index: Date
## # ... with 10 variables: Date <date>, observed <dbl>, season <dbl>,
## # trend <dbl>, remainder <dbl>, remainder_l1 <dbl>, remainder_l2 <dbl>,
## # anomaly <chr>, recomposed_l1 <dbl>, recomposed_l2 <dbl>
# We can adjust alpha, which is set to 0.05 by default. By default,
# the bands just cover the outside of the normal range.
p4 <- final_df1 %>%
time_decompose(count) %>%
anomalize(remainder, alpha = 0.05, max_anoms = 0.2) %>%
time_recompose() %>%
plot_anomalies(time_recomposed = TRUE) +
ggtitle("alpha = 0.05")
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 7 days
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 30 days
#> frequency = 7 days
#> trend = 91 days
p4
# Decreasing alpha widens the bands, making it harder for a point to be an
# outlier — here the bands become roughly twice as big.
p5 <- final_df1 %>%
  time_decompose(count) %>%
  anomalize(remainder, alpha = 0.025, max_anoms = 0.2) %>%
  time_recompose() %>%
  plot_anomalies(time_recomposed = TRUE) +
  ggtitle("alpha = 0.025")  # fixed: title previously said "alpha = 0.05" but the code uses 0.025
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 7 days
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 30 days
#> frequency = 7 days
#> trend = 91 days
p5
* Max Anoms
# The max_anoms parameter caps the maximum percentage of data points that
# can be flagged as anomalies. Set alpha = 0.3 so pretty much anything is
# an outlier, then compare max_anoms = 0.2 (20% anomalies allowed) against
# max_anoms = 0.05 (5% anomalies allowed).
p6 <- final_df1 %>%
time_decompose(count) %>%
anomalize(remainder, alpha = 0.3, max_anoms = 0.2) %>%
time_recompose() %>%
plot_anomalies(time_recomposed = TRUE) +
ggtitle("20% Anomalies")
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 7 days
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 30 days
#> frequency = 7 days
#> trend = 91 days
# Same loose alpha, but cap flagged anomalies at 5% of the data points
p7 <- final_df1 %>%
time_decompose(count) %>%
anomalize(remainder, alpha = 0.3, max_anoms = 0.05) %>%
time_recompose() %>%
plot_anomalies(time_recomposed = TRUE) +
ggtitle("5% Anomalies")
## Converting from tbl_df to tbl_time.
## Auto-index message: index = Date
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## frequency = 7 days
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## Note: Index not ordered. tibbletime assumes index is in ascending order. Results may not be as desired.
## trend = 30 days
#> frequency = 7 days
#> trend = 91 days
# Compare the 20% cap vs the 5% cap side by side
p6
p7
* Adjusting the max_anoms parameter results in anomalies being detected in our data.
# timetk's plot_anomaly_diagnostics() function makes it possible to tweak
# some of the detection parameters on the fly.
final_df1 %>% timetk::plot_anomaly_diagnostics(Date,count, .facet_ncol = 2)
## frequency = 7 observations per 1 week
## trend = 31 observations per 1 month
# To find the exact data points that are anomalies, use timetk's
# tk_anomaly_diagnostics() function. (The result below is an empty tibble:
# no anomalies are flagged at timetk's default settings either.)
final_df1 %>% timetk::tk_anomaly_diagnostics(Date, count) %>% filter(anomaly=='Yes')
## frequency = 7 observations per 1 week
## trend = 31 observations per 1 month
## # A tibble: 0 x 11
## # ... with 11 variables: Date <date>, observed <dbl>, season <dbl>,
## # trend <dbl>, remainder <dbl>, seasadj <dbl>, remainder_l1 <dbl>,
## # remainder_l2 <dbl>, anomaly <chr>, recomposed_l1 <dbl>, recomposed_l2 <dbl>